#!/bin/ksh

#
# Validate the links reachable from a top-level URL given on the command line
#

# Results of the validation pass are appended to this file; a log left
# behind by an earlier run is removed so every run starts empty.
LOGFILE=/tmp/valid_url.log
[[ -f $LOGFILE ]] && rm $LOGFILE

#
# url_feeder URL...
#
# Body of the co-process that hands out URLs.  It first consumes stdin
# until a line reading exactly "GO" arrives (or stdin reaches end-of-file),
# then writes each argument to stdout, one per line.
#
# Arguments: the URLs to emit, one per argument.
# Outputs:   one URL per line on stdout.
#
function url_feeder
{
  typeset url

  # Handshake: discard input until the start signal.  -r keeps any
  # backslash in the incoming line literal.
  while read -r
  do
    [[ $REPLY = "GO" ]] && break
  done

  # "$@" hands every URL over untouched; an unquoted $* would be split
  # again and glob-expanded (URLs may contain * ? [).  printf with a fixed
  # format never mistakes a URL for an option or an escape sequence.
  for url in "$@"
  do
    printf '%s\n' "$url"
  done
}

#
# find_urls URL [REPORT]
#
# Dumps URL with lynx and collects the unique link targets listed from the
# "References" line of the dump onwards.  Lines mentioning ftp://,
# javascript:, mailto:, news: or https:// are dropped, and everything from
# the first "?" of a link is cut off.
#
# Arguments:
#   URL     page to fetch.
#   REPORT  optional; any non-empty value selects report mode.
#
# Outputs:
#   REPORT empty     - stdout gets " <links>" (a leading blank followed by
#                      the newline-separated links) when the page yielded
#                      usable links, otherwise an empty line.
#   REPORT non-empty - nothing on stdout; one verdict line for URL is
#                      appended to $LOGFILE.
#
# Globals: LOGFILE (appended to in report mode).
#
# The "function" keyword form is kept on purpose: in ksh93 only that form
# gives typeset variables function-local scope.
#
function find_urls
{
  typeset url="$1" report="$2" urls verdict

  # tail -n +3 skips the first two surviving lines (the "References"
  # heading and the line after it); awk keeps the second field of each
  # numbered reference line, i.e. the link itself.
  urls=$(lynx -dump "$url" |
    sed -n '/^References/,$p' |
    grep -Ev 'ftp://|javascript:|mailto:|news:|https://' |
    tail -n +3 |
    awk '{print $2}' |
    cut -d'?' -f1 |
    sort -u)

  # Classify the page once; an empty verdict means the page is usable.
  if [[ -z $urls ]]
  then
    verdict="Link Not Found on Server or Forbidden"
  elif [[ $urls = "http://www.com.org/home.php" ]]
  then
    # NOTE(review): presumably the catch-all page served for unknown
    # hosts - confirm.  An exact match already implies a single link, so
    # no separate word count is needed.
    verdict="Site Not Found"
  else
    verdict=
  fi

  if [[ -n $report ]]
  then
    printf '%s\n' "$url ${verdict:-is valid}" >> "$LOGFILE"
  elif [[ -z $verdict ]]
  then
    printf ' %s\n' "$urls"
  else
    echo ""
  fi
}

# First command-line word as given; nothing in the visible script reads it.
first_url=$1

# Saved at top level because inside a ksh93 "function" $0 expands to the
# function's own name rather than the script's.
progname=$0

#
# usage
#
# Writes the usage line to stderr and terminates the script with status 2.
#
function usage
{
  echo "Usage: $progname -l [levels to look] -u [url] -p [parallel checks]" >&2
  exit 2
}

#
# parse_options [-l levels] [-u url] [-p parallel] ...
#
# Parses the command line into the globals used by the rest of the script.
#
# Globals written:
#   levels        link-following depth; decimal integer >= 0, default 0
#   totalurls     start URL(s) given with -u, default empty
#   max_parallel  simultaneous checks; decimal integer >= 1, default 1
#
# An unknown option, a missing option argument or a malformed number ends
# the script through usage.  Rejecting bad numbers here matters: a missing
# or zero -p would otherwise leave the validation loop spinning forever,
# and a negative -l would never count down to zero.
#
function parse_options
{
  typeset opt

  levels=0
  totalurls=
  max_parallel=1

  OPTIND=1
  while getopts l:u:p: opt
  do
    case $opt in
      l) levels=$OPTARG ;;
      u) totalurls=$OPTARG ;;
      p) max_parallel=$OPTARG ;;
      *) usage ;;
    esac
  done

  # Plain decimal only; leading zeros are refused so later shell
  # arithmetic cannot read the value as octal.
  case $levels in
    ''|*[!0-9]*|0?*) usage ;;
  esac
  case $max_parallel in
    ''|*[!0-9]*|0*) usage ;;
  esac
}

parse_options "$@"

# Link discovery, breadth first.
#
# Starting from the URL(s) in $totalurls, follow links for at most $levels
# rounds.  Each round fetches only the URLs found in the previous round
# (the "frontier"), so no page is dumped twice; fetching the whole
# accumulated list again on every round multiplies the work by the number
# of duplicates collected so far without adding any new URL.
#
# Reads:  levels, totalurls
# Writes: totalurls (sorted, unique, one URL per line), url_count
# Output: a running "Current total" line after each fetched page (count of
#         unique URLs known so far) and a "Final unique total" line.

# The lists below are split on whitespace on purpose; globbing is switched
# off so a URL containing * ? or [ is never expanded against the
# filesystem.
set -f

known=" "        # every URL seen so far, each wrapped in single blanks
frontier=        # URLs discovered but not fetched yet
url_count=0
for url in $totalurls
do
  case $known in
    *" $url "*) ;;
    *) known="$known$url "
       frontier="$frontier $url"
       (( url_count += 1 )) ;;
  esac
done

# A non-positive or empty level count, or an exhausted frontier, ends the
# walk; testing "greater than zero" also stops a negative count from
# looping forever.
while (( levels > 0 )) && [[ -n $frontier ]]
do
  (( levels -= 1 ))
  next_frontier=
  for url in $frontier
  do
    for link in $(find_urls "$url")
    do
      case $known in
        *" $link "*) ;;
        *) known="$known$link "
           next_frontier="$next_frontier $link"
           (( url_count += 1 )) ;;
      esac
    done
    echo Current total number of urls is: $url_count
  done
  frontier=$next_frontier
done

totalurls=$(printf '%s\n' $known | sort -u)

# Recount from the sorted list; the arithmetic expansion strips the
# padding some wc implementations put in front of the number.
url_count=$(( $(printf '%s\n' $totalurls | wc -w) ))
echo Final unique total number of urls is: $url_count

set +f

# Validation pass: check every collected URL, at most $max_parallel at a
# time, logging one verdict per URL through find_urls' report mode.
#
# Reads:  totalurls, url_count, max_parallel, LOGFILE (via find_urls)
# Output: a "Processed N URLs" line after each finished batch.

# A missing, zero or non-numeric limit would start no jobs per round and
# leave the outer loop spinning forever; fall back to one job at a time.
case $max_parallel in
  ''|*[!0-9]*|0*) max_parallel=1 ;;
esac

# Normalise the count (also strips any padding wc may have left) so the
# arithmetic tests below always see a number.
url_count=$(( ${url_count:-0} ))

# Start the feeder as a co-process.  The list is split on whitespace on
# purpose; globbing is off while it is expanded so URL characters such as
# * ? [ stay literal.
set -f
url_feeder $totalurls |&
coprocess_pid=$!
set +f

processed_urls=0

# Release the feeder: it holds back its output until it reads "GO".
print -p "GO"

while (( processed_urls < url_count ))
do
  parallel_pids=
  parallel_jobs=0

  # Launch one batch: up to max_parallel jobs, never more than remain.
  while (( parallel_jobs < max_parallel &&
           processed_urls + parallel_jobs < url_count ))
  do
    # Stop launching if the feeder has nothing more to give; reusing the
    # previous value of url would validate the same URL twice.
    read -r -p url || break
    find_urls "$url" v &
    parallel_pids="$parallel_pids $!"
    parallel_jobs=$((parallel_jobs + 1))
  done

  # Nothing started means the feeder ran dry early; leave instead of
  # looping without progress.
  (( parallel_jobs > 0 )) || break

  # Unquoted on purpose: one PID per word.
  wait $parallel_pids
  processed_urls=$((processed_urls + parallel_jobs))
  echo Processed $processed_urls URLs
done

wait $coprocess_pid
